//  TorusGamesGPUFunctions.metal
//
//	© 2021 by Jeff Weeks
//	See TermsOfUse.txt

#include <metal_stdlib>
using namespace metal;
#include "TorusGamesGPUDefinitions.h"


//	Apple's Metal Shading Language specs
//
//		https://developer.apple.com/metal/metal-shading-language-specification.pdf
//
//	give the following advice for choosing between the "constant" and "device" address spaces:
//
//		NOTE: To decide which address space (device or constant), a read-only buffer
//		passed to a graphics or kernel function should use, look at how the buffer
//		is accessed inside the graphics or kernel function. The constant address space
//		is optimized for multiple instances executing a graphics or kernel function
//		accessing the same location in the buffer. Some examples of this access pattern
//		are accessing light or material properties for lighting / shading, ... .
//		If multiple executing instances of a graphics or kernel function are accessing the buffer
//		using an index such as the vertex ID, fragment coordinate, or the thread position in grid,
//		the buffer should be allocated in the device address space.
//
//	but...
//
//		For buffers in the device address space, the offset must be aligned
//		to the data type consumed by the vertex shader function
//		(which is always less than or equal to 16 bytes).
//
//		For buffers in the constant address space, the offset must be aligned
//		to 256 bytes in macOS. In iOS, the offset must be aligned to the maximum
//		of either the data type consumed by the vertex shader function, or 4 bytes.
//		A 16-byte alignment is always safe in iOS if you do not need to worry
//		about the data type.
//
//	So apps that pass individual fields of a struct (for example, a single color)
//	as a separate GPU function argument will run into trouble on macOS if that
//	individual field isn't 256-byte aligned.  To work around this problem,
//	we may sometimes want to pass an argument using
//
//		the constant address space on iOS, but
//		the  device  address space on macOS.
//
//	This lets the iOS app enjoy whatever small performance advantage
//	the constant address space offers, while the macOS app enjoys
//	the device address space's more lenient alignment requirements.
//
//	To allow conditional compilation, we could in principle use function constants,
//	but we'd need two versions of each argument
//
//		constant float	&fooForIOS		[[ buffer(0), function_constant(gCompileForIOS)   ]],
//		device float	&fooForMacOS	[[ buffer(0), function_constant(gCompileForMacOS) ]],
//
//	along with a few lines of code to select between them
//
//		float	foo;
//
//		if (gCompileForIOS)
//			foo = fooForIOS;
//		if (gCompileForMacOS)
//			foo = fooForMacOS;
//
//	Such an approach would work correctly, but would turn clean simple GPU functions
//	into messy ugly GPU functions.
//
//	If we could #define some constants, we could instead write code like
//
//		#ifdef COMPILE_SHADERS_FOR_IOS
//			constant
//		#endif
//		#ifdef COMPILE_SHADERS_FOR_MACOS
//			device
//		#endif
//				float	&foo	[[ buffer(0) ]],
//
//	which would minimize the mess, but I don't know how to provide
//	those platform-specific #defined constants (at least not without
//	manually compiling the Metal code, which again gets messy).
//
//	So for now, let's just go with the device address space on iOS as well as macOS,
//	and not worry about the (hopefully negligible) performance penalty.
//
//	I've defined the affected arguments as "ConstantOrDevice",
//	so we at least have some record of which arguments we'd like
//	to put in the constant address space, if only we could do so
//	without making the code too messy.
//
#define ConstantOrDevice	device const


//	Function constants.  Their values aren't known in this file;
//	they get supplied from outside when each GPU function is specialized,
//	and are identified by the indices 0…3 given below.
//
//	gClipToFrameCell
//		In the 3D polyhedron vertex function, selects clipping
//		to the frame cell (ViewBasicLarge) rather than
//		radial clipping (ViewRepeating).
//
//	gDrawSlice, gDrawSolid
//		Exactly one of these is true, because gDrawSolid is defined
//		as the negation of gDrawSlice.  The 3D polyhedron vertex function
//		uses them to decide which of its arguments exist.
//
//	gClipSliceToWinLine
//		In the 3D polyhedron vertex function, enables the extra pair
//		of clip distances that use itsExtraClippingCovector.
//
//	gMakeGameChoiceIcons
//		NOTE(review): not referenced by any GPU function visible
//		in this part of the file -- confirm where it's used.
//
constant bool	gClipToFrameCell		[[ function_constant(0) ]];
constant bool	gDrawSlice				[[ function_constant(1) ]];
constant bool	gDrawSolid = ! gDrawSlice;
constant bool	gClipSliceToWinLine		[[ function_constant(2) ]];
constant bool	gMakeGameChoiceIcons	[[ function_constant(3) ]];


//	Per-vertex input for 2D geometry that carries a position only.
//	Consumed by TorusGamesVertexFunction2DSpritePlain.
struct VertexInput2DPlain
{
	float2	pos [[ attribute(VertexAttribute2DPosition) ]];
};

//	Per-vertex input for 2D geometry that carries
//	a position along with texture coordinates.
struct VertexInput2DWithTexture
{
	float2	pos	[[ attribute(VertexAttribute2DPosition)		]];	//	vertex position
	float2	tex	[[ attribute(VertexAttribute2DTexCoords)	]];	//	texture coordinates
};

//	Per-vertex input for 2D lines with endcaps:  a position,
//	texture coordinates, and a flag telling whether
//	TorusGamesVertexFunction2DLineWithEndcaps should offset the vertex.
struct VertexInput2DOffsettableWithTexture
{
	float2	pos	[[ attribute(VertexAttribute2DPosition)		]];	//	vertex position
	float2	tex	[[ attribute(VertexAttribute2DTexCoords)	]];	//	texture coordinates
	float	off	[[ attribute(VertexAttribute2DOffset)		]];	//	∈ {0.0, 1.0}
																//		0.0 = leave the vertex where it is
																//		1.0 = offset the vertex
};


//	Per-vertex input for TorusGamesVertexFunction3DWall.
struct VertexInput3DWall
{
	//	position in the wall's local coordinates
	float3	pos [[ attribute(VertexAttribute3DWallPosition) ]];
	//	texture coordinates
	float2	tex	[[ attribute(VertexAttribute3DWallTexCoords)]];
	//	wgt[0] weights the vertex's outer position and texture coordinates,
	//	wgt[1] weights its inner (aperture-dependent) position and texture coordinates
	float2	wgt	[[ attribute(VertexAttribute3DWallWeight)	]];
};

//	Per-vertex input for TorusGamesVertexFunction3DPolyhedron.
struct VertexInput3DPolyhedron
{
	//	position, before the placement's dilation and isometry get applied
	float3	pos [[ attribute(VertexAttribute3DPolyhedronPosition)	]];
	//	normal vector, stored at half precision;
	//	the vertex function promotes it to float3 before transforming it
	half3	nor	[[ attribute(VertexAttribute3DPolyhedronNormal)		]];
};


//	Output of the 2D vertex function that needs no texture coordinates.
struct VertexOutput2DPlain
{
	float4	position		[[ position			]];
	//	The 2D vertex functions set these four values to
	//	±x + itsClippingDistance and ±y + itsClippingDistance.
	float	clipDistance	[[ clip_distance	]] [4];
};

//	Output of the 2D vertex functions that pass texture coordinates
//	along to the fragment function.
struct VertexOutput2DTexCoords
{
	float4	position		[[ position			]];
	//	The 2D vertex functions set these four values to
	//	±x + itsClippingDistance and ±y + itsClippingDistance.
	float	clipDistance	[[ clip_distance	]] [4];
	//	Received by FragmentInput2DTexCoords under the same user(texcoords) name.
	float2	texCoords		[[ user(texcoords)	]];
};


//	Output of TorusGamesVertexFunction3DWall.
//	The color and texCoords fields are received by FragmentInput3DWall
//	under the same user(color) and user(texcoords) names.
struct VertexOutput3DWall
{
	float4	position		[[ position			]];
	half4	color			[[ user(color)		]];	//	premultiplied alpha
	float2	texCoords		[[ user(texcoords)	]];
};

//	Output of TorusGamesVertexFunction3DPolyhedron.
struct VertexOutput3DPolyhedron
{
	float4	position		[[ position			]];
	half4	color			[[ user(color)		]];	//	premultiplied alpha
	//	The vertex function uses
	//		clipDistance[0…5] for frame-cell clipping or radial clipping, and
	//		clipDistance[6…7] for the optional win-line clipping,
	//	setting unused entries to 1.0.
	float	clipDistance	[[ clip_distance	]] [8];
};


//	Fragment-stage view of VertexOutput2DTexCoords:
//	only the interpolated texture coordinates are needed.
struct FragmentInput2DTexCoords
{
	float2	texCoords		[[ user(texcoords)	]];
};

//	Fragment-stage view of VertexOutput3DWall:
//	the interpolated color and texture coordinates.
struct FragmentInput3DWall
{
	half4	color			[[ user(color)		]];	//	premultiplied alpha
	float2	texCoords		[[ user(texcoords)	]];
};

//	Fragment-stage view of VertexOutput3DPolyhedron:
//	only the interpolated color is needed.
struct FragmentInput3DPolyhedron
{
	half4	color			[[ user(color)		]];	//	premultiplied alpha
};


#pragma mark -
#pragma mark 2D vertex functions

//	Vertex function for the 2D background.
//
//		in							vertex position and texture coordinates
//		textureRepetitions			factor by which the texture coordinates get scaled
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DTexCoords TorusGamesVertexFunction2DBackground(
	VertexInput2DWithTexture						in							[[ stage_in										]],
	ConstantOrDevice float							&textureRepetitions			[[ buffer(BufferIndexVFMisc)					]],
	const device TorusGames2DCoveringTransformation	*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData					&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort											iid							[[ instance_id									]]	)
{
	//	Apply this instance's covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DTexCoords	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	//	The background texture repeats in both directions.
	theVertex.texCoords = in.tex * textureRepetitions;

	return theVertex;
}

//	Vertex function for an untextured 2D sprite.
//
//		in							vertex position
//		spritePlacement				places the sprite
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DPlain TorusGamesVertexFunction2DSpritePlain(
	VertexInput2DPlain									in							[[ stage_in										]],
	ConstantOrDevice TorusGames2DSpritePlacementMatrix	&spritePlacement			[[ buffer(BufferIndexVFPlacement)				]],
	device const TorusGames2DCoveringTransformation		*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData						&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort												iid							[[ instance_id									]]	)
{
	//	Apply the sprite placement, then this instance's
	//	covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * spritePlacement
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DPlain	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	return theVertex;
}

//	Vertex function for a textured 2D sprite.
//	The texture coordinates pass through unchanged.
//
//		in							vertex position and texture coordinates
//		spritePlacement				places the sprite
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DTexCoords TorusGamesVertexFunction2DSpriteWithTexture(
	VertexInput2DWithTexture							in							[[ stage_in										]],
	ConstantOrDevice TorusGames2DSpritePlacementMatrix	&spritePlacement			[[ buffer(BufferIndexVFPlacement)				]],
	device const TorusGames2DCoveringTransformation		*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData						&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort												iid							[[ instance_id									]]	)
{
	//	Apply the sprite placement, then this instance's
	//	covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * spritePlacement
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DTexCoords	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	theVertex.texCoords = in.tex;

	return theVertex;
}

//	Vertex function for a textured 2D line whose texture
//	repeats along the line's length but not across its breadth.
//
//		in							vertex position and texture coordinates
//		textureRepetitions			scales the first texture coordinate only
//		spritePlacement				places the line
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DTexCoords TorusGamesVertexFunction2DLine(
	VertexInput2DWithTexture							in							[[ stage_in										]],
	ConstantOrDevice float								&textureRepetitions			[[ buffer(BufferIndexVFMisc)					]],
	ConstantOrDevice TorusGames2DSpritePlacementMatrix	&spritePlacement			[[ buffer(BufferIndexVFPlacement)				]],
	device const TorusGames2DCoveringTransformation		*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData						&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort												iid							[[ instance_id									]]	)
{
	//	Apply the line's placement, then this instance's
	//	covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * spritePlacement
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DTexCoords	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	theVertex.texCoords = float2(
		textureRepetitions * in.tex[0],	//	lengthwise:  the texture repeats
		                     in.tex[1]);	//	crosswise:   it never does

	return theVertex;
}

//	Vertex function for a textured 2D line with endcaps.
//	Vertices flagged with in.off = 1.0 get placed with a version
//	of spritePlacement whose long axis has been stretched
//	by an amount proportional to the short axis;
//	vertices with in.off = 0.0 use spritePlacement as is.
//
//		in							vertex position, texture coordinates and offset flag
//		spritePlacement				places the line
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DTexCoords TorusGamesVertexFunction2DLineWithEndcaps(
	VertexInput2DOffsettableWithTexture					in							[[ stage_in										]],
	ConstantOrDevice TorusGames2DSpritePlacementMatrix	&spritePlacement			[[ buffer(BufferIndexVFPlacement)				]],
	device const TorusGames2DCoveringTransformation		*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData						&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort												iid							[[ instance_id									]]	)
{
	//	Read spritePlacement as the matrix
	//
	//		( a  c  e )
	//		( b  d  f )
	//		( 0  0  1 )
	//
	//	written relative to the right-to-left matrix convention,
	//	so that (a,b) is the image of the line's long axis
	//	and (c,d) is the image of its short axis.
	//
	const float	theA	= spritePlacement[0][0],
				theB	= spritePlacement[0][1],
				theC	= spritePlacement[1][0],
				theD	= spritePlacement[1][1];

	//	The sign of the determinant ad - bc tells whether the image
	//	is flipped (-1.0) or not (+1.0).
	const float	theOrientation	= sign(theA * theD - theB * theC);

	//	An "offset" vertex needs its long axis extended by
	//
	//		( Δa, Δb ) = (-d,  c )		if the image is flipped
	//		( Δa, Δb ) = ( d, -c )		if the image is not flipped
	//
	//	while an ordinary vertex (in.off = 0.0) needs no extension at all.
	//
	const float	theStretch		= in.off * theOrientation;
	float3x3	thePlacement	= spritePlacement;
	thePlacement[0][0] += theStretch * theD;
	thePlacement[0][1] -= theStretch * theC;

	//	Apply the (possibly stretched) placement, then this instance's
	//	covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * thePlacement
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DTexCoords	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	theVertex.texCoords = in.tex;

	return theVertex;
}

//	Vertex function for a 2D sprite that shows only a rectangular
//	subset of its texture.  The incoming texture coordinates get mapped
//	affinely into the rectangle that texturePlacement describes.
//
//		in							vertex position and texture coordinates
//		spritePlacement				places the sprite
//		texturePlacement			the texture rectangle (see the note in the argument list)
//		coveringTransformations		one transformation per instance, selected by iid
//		world						provides itsDragPlacement and itsClippingDistance
//		iid							instance ID
//
vertex VertexOutput2DTexCoords TorusGamesVertexFunction2DSpriteWithTextureSubset(
	VertexInput2DWithTexture							in							[[ stage_in										]],
	ConstantOrDevice TorusGames2DSpritePlacementMatrix	&spritePlacement			[[ buffer(BufferIndexVFPlacement)				]],
	ConstantOrDevice float4								&texturePlacement			[[ buffer(BufferIndexVFMisc)					]],
															//	= ( u_min, v_min, u_max - u_min, v_max - v_min ) in bottom-up image
	device const TorusGames2DCoveringTransformation		*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations)	]],
	constant TorusGames2DWorldData						&world						[[ buffer(BufferIndexVFWorldData)				]],
	ushort												iid							[[ instance_id									]]	)
{
	//	Apply the sprite placement, then this instance's
	//	covering transformation, then the drag placement.
	const float2	theXY		= (	world.itsDragPlacement
								  * coveringTransformations[iid]
								  * spritePlacement
								  * float3(in.pos, 1.0)
								  ).xy;
	const float		theMargin	= world.itsClippingDistance;

	VertexOutput2DTexCoords	theVertex;

	theVertex.position = float4(theXY, 0.0, 1.0);

	//	Metal keeps the region where every clip distance is ≥ 0,
	//	namely the square |x| ≤ theMargin, |y| ≤ theMargin.
	theVertex.clipDistance[0]	= theMargin + theXY.x;
	theVertex.clipDistance[1]	= theMargin - theXY.x;
	theVertex.clipDistance[2]	= theMargin + theXY.y;
	theVertex.clipDistance[3]	= theMargin - theXY.y;

	//	( u, v ) = ( u_min, v_min ) + ( u_width, v_height ) * in.tex, componentwise
	theVertex.texCoords = texturePlacement.xy + texturePlacement.zw * in.tex;

	return theVertex;
}


#pragma mark -
#pragma mark 2D fragment functions

//	Fragment function that returns the sampled texel, unmodified.
//	The texel is treated as premultiplied alpha, and so is the result.
fragment half4 TorusGamesFragmentFunction2DTexture(
	FragmentInput2DTexCoords	in				[[ stage_in					]],
	texture2d<half>				texture			[[ texture(TextureIndexFF)	]],
	sampler						textureSampler	[[ sampler(SamplerIndexFF)	]])
{
	return texture.sample(textureSampler, in.texCoords);	//	premultiplied alpha
}

//	Fragment function that paints every fragment with the same solid color.
fragment half4 TorusGamesFragmentFunction2DColor(
	ConstantOrDevice half4	&color	[[ buffer(BufferIndexFFMisc)	]])	//	premultiplied alpha
{
	const half4	theSolidColor = color;	//	premultiplied alpha

	return theSolidColor;
}

//	Fragment function that tints the sampled texel,
//	multiplying it componentwise by the given color.
//	The color and the texel are both treated as premultiplied alpha,
//	and so is their product.
fragment half4 TorusGamesFragmentFunction2DColorWithRGBATexture(
	FragmentInput2DTexCoords	in				[[ stage_in					]],
	ConstantOrDevice half4		&color			[[ buffer(BufferIndexFFMisc)]],	//	premultiplied alpha
	texture2d<half>				texture			[[ texture(TextureIndexFF)	]],
	sampler						textureSampler	[[ sampler(SamplerIndexFF)	]])
{
	const half4	theTexel = texture.sample(textureSampler, in.texCoords);	//	premultiplied alpha

	return color * theTexel;	//	premultiplied alpha
}

//	Fragment function that scales the given color by a mask texture.
//
//	Conceptually the mask is an alpha texture, but it arrives
//	in the red channel instead:  MTLPixelFormatR8Unorm is "shader writable"
//	and thus supports GPU mipmap generation, while MTLPixelFormatA8Unorm is not.
//
fragment half4 TorusGamesFragmentFunction2DColorWithMask(
	FragmentInput2DTexCoords	in				[[ stage_in					]],
	ConstantOrDevice half4		&color			[[ buffer(BufferIndexFFMisc)]],	//	premultiplied alpha
	texture2d<half>				texture			[[ texture(TextureIndexFF)	]],	//	mask is in red channel, not alpha channel; see note below
	sampler						textureSampler	[[ sampler(SamplerIndexFF)	]])
{
	//	The red channel holds the opacity.
	const half	theOpacity = texture.sample(textureSampler, in.texCoords).r;

	//	Scaling all four components keeps the result premultiplied.
	return color * theOpacity;
}

//	Fragment function that draws black grid lines procedurally.
//
//	In each coordinate direction, a fragment is fully opaque
//	when the fractional part of its texture coordinate lies within
//	cntLineHalfWidth of 0.5, and it fades to fully transparent
//	across a transition zone whose width is 1/transitionWidthInverse.
//
fragment half4 TorusGamesFragmentFunction2DGrid(
	FragmentInput2DTexCoords	in						[[ stage_in					]],
	ConstantOrDevice float		&transitionWidthInverse	[[ buffer(BufferIndexFFMisc)]])	//	inverse of transition zone width
{
	//	half width of the grid lines, excluding the transition zones,
	//	as a fraction of the cell size
	const float		cntLineHalfWidth	= 1.0/64.0;

	//	How far does the fragment sit from the line's center,
	//	separately in each coordinate direction?
	const float2	theOffsetFromLine	= abs(fract(in.texCoords) - 0.5);

	//	0.0 on the line itself, rising linearly to 1.0 across the transition zone
	const float2	theTransparency		= clamp(
											transitionWidthInverse * (theOffsetFromLine - cntLineHalfWidth),
											0.0,
											1.0);

	//	The fragment is transparent only to the extent
	//	that it misses both families of grid lines.
	const float		theOpacity			= 1.0 - theTransparency.x * theTransparency.y;

	return half4(0.0h, 0.0h, 0.0h, half(theOpacity));	//	black, premultiplied alpha
}


#pragma mark -
#pragma mark 3D wall vertex function

//	Vertex function for the 3D walls.
//
//	Each vertex carries an "outer" position along with a pair of weights.
//	The corresponding "inner" position is a combination of the outer position
//	and the wall's center, with coefficients world.itsAperture[0] and
//	world.itsAperture[1].  The final position is the weighted combination
//	of the outer and inner positions.  The texture coordinates get
//	the same treatment, and then get shifted by texCoordShift.
//
//		in				position, texture coordinates and weights
//		placement		places the wall in the frame cell
//		color			the wall's color, premultiplied alpha
//		texCoordShift	added to the final texture coordinates
//		world			provides itsAperture, itsFrameCellIntoWorld and itsProjection
//
//	This code favors clarity over efficiency.  It's not time-critical,
//	because there are only a few wall vertices per frame.
//
vertex VertexOutput3DWall TorusGamesVertexFunction3DWall(
	VertexInput3DWall				in				[[ stage_in								]],
	ConstantOrDevice float4x4		&placement		[[ buffer(BufferIndexVFPlacement)		]],
	ConstantOrDevice half4			&color			[[ buffer(BufferIndexVFMisc)			]],
	ConstantOrDevice float2			&texCoordShift	[[ buffer(BufferIndexVFTexCoordShift)	]],
	constant TorusGames3DWorldData	&world			[[ buffer(BufferIndexVFWorldData)		]] )
{
	const float3	cntCenterInWall		= float3( 0.0,  0.0,  0.0 );		//	in wall's local coordinates
	const float2	cntCenterTexCoords	= float2( 0.5,  0.5 );
	const float3	cntLightInWorld		= float3( 0.0,  0.0, -1.0 );
	const float4	cntNormalInWall		= float4( 0.0,  0.0, -1.0,  0.0 );	//	w = 0 marks a direction, not a point

	VertexOutput3DWall	theVertex;


	//	Position

	//	all three in the wall's local coordinates
	const float3	theOuterPosition	= in.pos;
	const float3	theInnerPosition	= world.itsAperture[0] * theOuterPosition
										+ world.itsAperture[1] * cntCenterInWall;
	const float3	theBlendedPosition	= in.wgt[0] * theOuterPosition
										+ in.wgt[1] * theInnerPosition;

	//	in world coordinates
	const float4	thePositionInWorld	= world.itsFrameCellIntoWorld
										* placement
										* float4(theBlendedPosition, 1.0);

	theVertex.position = world.itsProjection * thePositionInWorld;


	//	Texture coordinates

	const float2	theOuterTexCoords	= in.tex;
	const float2	theInnerTexCoords	= world.itsAperture[0] * theOuterTexCoords
										+ world.itsAperture[1] * cntCenterTexCoords;
	const float2	theBlendedTexCoords	= in.wgt[0] * theOuterTexCoords
										+ in.wgt[1] * theInnerTexCoords;

	theVertex.texCoords = theBlendedTexCoords + texCoordShift;


	//	Lighting
	//
	//		On extended surfaces like the walls, a point light
	//		gives better looking results than a directional light would.
	//
	const float3	theDirectionToLight	= normalize(cntLightInWorld - thePositionInWorld.xyz);
	const float3	theNormalInWorld	= (	world.itsFrameCellIntoWorld
										  * placement
										  * cntNormalInWall
										  ).xyz;

	//	abs() lets the light reach backfaces too.
	const half		theDiffuseFactor	= 0.5h + 0.5h * abs( half(dot(theDirectionToLight, theNormalInWorld)) );


	//	Final color

	const half		theBrightness		= theDiffuseFactor;	//	the walls get no fog

	theVertex.color = half4(theBrightness, theBrightness, theBrightness, 1.0h) * color;	//	premultiplied alpha

	return theVertex;
}


#pragma mark -
#pragma mark 3D wall fragment functions

//	Fragment function for a plain wall:
//	returns the interpolated vertex color, unmodified.
fragment half4 TorusGamesFragmentFunction3DWallPlain(
	FragmentInput3DWall	in	[[ stage_in	]])
{
	const half4	theWallColor = in.color;	//	premultiplied alpha

	return theWallColor;
}

//	Fragment function for a "reflection" wall:
//	leaves the color unchanged on the half of the wall where s < 1/2,
//	and reddens it slightly on the half where s ≥ 1/2,
//	with s being the first texture coordinate wrapped to [0,1).
fragment half4 TorusGamesFragmentFunction3DWallReflection(
	FragmentInput3DWall	in	[[ stage_in	]])
{
	//	Multiplying by this color dims the green and blue channels,
	//	leaving the red channel and the alpha channel untouched.
	const half4	cntReddening	= half4( 1.00h,  0.75h,  0.75h, 1.00h );

	//	Wrap the s-coordinate to [0,1).
	const float	theWrappedS		= fract(in.texCoords[0]);

	return (theWrappedS >= 0.5) ?
			in.color * cntReddening :	//	premultiplied alpha
			in.color;					//	premultiplied alpha
}

//	Fragment function for a "quarter turn" wall:
//	slightly darkens alternate sectors of an eight-sector pinwheel
//	centered on the wall.  The sign of parityFactor decides
//	which set of sectors gets darkened.
fragment half4 TorusGamesFragmentFunction3DWallQuarterTurn(
	FragmentInput3DWall		in				[[ stage_in					]],
	ConstantOrDevice float	&parityFactor	[[ buffer(BufferIndexFFMisc)]] )
{
	const half4	cntDarkening	= half4( 0.875h,  0.875h,  0.875h, 1.000h );

	//	Wrap the texture coordinates to the standard square [0,1) × [0,1),
	//	and then shift them so that the wall's center sits at (0,0).
	const float2	theCentered	= fract(in.texCoords) - float2(0.5, 0.5);

	//	Each of the first four factors changes sign along
	//	one of the four desired color-change lines
	//	(x = 0, y = 0, x = y and x = -y),
	//	while the external parityFactor is the same at every fragment.
	const float		thePinwheel	= theCentered.x
								* theCentered.y
								* (theCentered.x - theCentered.y)
								* (theCentered.x + theCentered.y)
								* parityFactor;

	//	The sign of the product selects the sectors to darken.
	return (thePinwheel >= 0.0) ?
			in.color * cntDarkening :	//	premultiplied alpha
			in.color;					//	premultiplied alpha
}

//	Fragment function for a "half turn" wall:
//	slightly darkens two diagonally opposite quadrants of the wall.
//	The sign of parityFactor decides which pair of quadrants gets darkened.
fragment half4 TorusGamesFragmentFunction3DWallHalfTurn(
	FragmentInput3DWall		in				[[ stage_in					]],
	ConstantOrDevice float	&parityFactor	[[ buffer(BufferIndexFFMisc)]] )
{
	const half4	cntDarkening	= half4( 0.875h,  0.875h,  0.875h, 1.000h );

	//	Wrap the texture coordinates to the standard square [0,1) × [0,1),
	//	and then shift them so that the wall's center sits at (0,0).
	const float2	theCentered	= fract(in.texCoords) - float2(0.5, 0.5);

	//	Each of the first two factors changes sign along
	//	one of the two desired color-change lines (x = 0 and y = 0),
	//	while the external parityFactor is the same at every fragment.
	const float		theQuadrant	= theCentered.x
								* theCentered.y
								* parityFactor;

	//	The sign of the product selects the quadrants to darken.
	return (theQuadrant >= 0.0) ?
			in.color * cntDarkening :	//	premultiplied alpha
			in.color;					//	premultiplied alpha
}


#pragma mark -
#pragma mark 3D polyhedron vertex functions

//	Vertex function for the 3D polyhedra.  The function constants
//	select which variant gets compiled:
//
//		gDrawSolid			The polyhedron is placed in the game cell
//							and then replicated by coveringTransformations[iid].
//							Its color is solidColor.
//
//		gDrawSlice			The polyhedron is placed directly in the frame cell.
//							Its color is a fixed dark grey.
//
//		gClipToFrameCell	true  (ViewBasicLarge):  clip to the frame cell
//							false (ViewRepeating):   clip radially about the camera
//
//		gClipSliceToWinLine	additionally clip using
//							placementInFrameCell.itsExtraClippingCovector
//
//	Arguments marked function_constant(gDrawSolid) exist only for solids,
//	and the one marked function_constant(gDrawSlice) exists only for slices.
//
//		in							vertex position and normal
//		placementInGameCell			solid only
//		placementInFrameCell		slice only
//		coveringTransformations		solid only;  one transformation per instance
//		world						transformations, clipping radius and fog parameters
//		solidColor					solid only;  premultiplied alpha
//		iid							solid only;  instance ID
//
vertex VertexOutput3DPolyhedron TorusGamesVertexFunction3DPolyhedron(
	VertexInput3DPolyhedron									in							[[ stage_in																			]],
	ConstantOrDevice TorusGames3DPolyhedronPlacementAsSIMD	&placementInGameCell		[[ buffer(BufferIndexVFPlacement),					function_constant(gDrawSolid)	]],
	ConstantOrDevice TorusGames3DPolyhedronPlacementAsSIMD	&placementInFrameCell		[[ buffer(BufferIndexVFPlacement),					function_constant(gDrawSlice)	]],
	device const TorusGames3DCoveringTransformation			*coveringTransformations	[[ buffer(BufferIndexVFCoveringTransformations),	function_constant(gDrawSolid)	]],
	constant TorusGames3DWorldData							&world						[[ buffer(BufferIndexVFWorldData)													]],
	ConstantOrDevice half4									&solidColor					[[ buffer(BufferIndexVFMisc),						function_constant(gDrawSolid)	]],
	ushort													iid							[[ instance_id,										function_constant(gDrawSolid)	]]	)
{
	const float3	cntLightDirection	= float3(0.0, 0.0, -1.0);
	const half4		cntSliceColor		= half4(0.25h, 0.25h, 0.25h, 1.00h);

	VertexOutput3DPolyhedron	theVertex;
	float4						thePositionInFrameCell,	//	gets no value for a solid in ViewRepeating
								thePositionInWorld;
	float3						theNormalInWorld;
	half4						theBaseColor;			//	premultiplied alpha


	//	Position, normal and base color

	if (gDrawSolid)
	{
		const float4	thePositionInTiling	= coveringTransformations[iid]
											* placementInGameCell.itsIsometricPlacement
											* float4(in.pos * placementInGameCell.itsDilation, 1.0);

		if (gClipToFrameCell)	//	ViewBasicLarge
		{
			//	Pass through the frame cell's coordinate system,
			//	because the clipping code below will need thePositionInFrameCell.
			thePositionInFrameCell	= world.itsTilingIntoFrameCell
									* thePositionInTiling;

			thePositionInWorld		= world.itsFrameCellIntoWorld
									* thePositionInFrameCell;
		}
		else					//	ViewRepeating
		{
			thePositionInWorld		= world.itsTilingIntoWorld
									* thePositionInTiling;
		}

		theNormalInWorld	= (
								  world.itsTilingIntoWorld
								* coveringTransformations[iid]
								* placementInGameCell.itsIsometricPlacement
								* float4(float3(in.nor), 0.0)	//	w = 0 for a direction;  itsDilation is ignored
							  ).xyz;

		theBaseColor		= solidColor;
	}
	else	//	gDrawSlice
	{
		thePositionInFrameCell	= placementInFrameCell.itsIsometricPlacement
								* float4(in.pos * placementInFrameCell.itsDilation, 1.0);

		thePositionInWorld		= world.itsFrameCellIntoWorld
								* thePositionInFrameCell;

		theNormalInWorld	= (
								  world.itsFrameCellIntoWorld
								* placementInFrameCell.itsIsometricPlacement
								* float4(float3(in.nor), 0.0)	//	w = 0 for a direction;  itsDilation is ignored
							  ).xyz;

		theBaseColor		= cntSliceColor;
	}

	theVertex.position = world.itsProjection * thePositionInWorld;


	//	The radial distance from the camera at (0,0,-1) to the current vertex,
	//	in world coordinates, serves both the clipping and the fog.

	const float	theDeltaX			= thePositionInWorld.x,
				theDeltaY			= thePositionInWorld.y,
				theDeltaZ			= thePositionInWorld.z + 1.0;
	const float	theRadialDistance	= sqrt(	theDeltaX * theDeltaX
										  + theDeltaY * theDeltaY
										  + theDeltaZ * theDeltaZ);


	//	Clipping
	//
	//		Metal clips to the intersection of the half-spaces
	//
	//			clipDistance[i] ≥ 0.0
	//
	//		so the constant value 1.0 disables an unused clip distance.

	if (gClipToFrameCell)	//	ViewBasicLarge
	{
		//	Clip all triangles to lie within the frame cell,
		//	which is a cube with corners at (±1/2, ±1/2, ±1/2).
		theVertex.clipDistance[0]	= 0.5 + thePositionInFrameCell.x;
		theVertex.clipDistance[1]	= 0.5 - thePositionInFrameCell.x;
		theVertex.clipDistance[2]	= 0.5 + thePositionInFrameCell.y;
		theVertex.clipDistance[3]	= 0.5 - thePositionInFrameCell.y;
		theVertex.clipDistance[4]	= 0.5 + thePositionInFrameCell.z;
		theVertex.clipDistance[5]	= 0.5 - thePositionInFrameCell.z;
	}
	else					//	ViewRepeating
	{
		//	Clip radially.
		theVertex.clipDistance[0]	= world.itsClippingRadiusForRepeatingView - theRadialDistance;

		theVertex.clipDistance[1]	= 1.0;
		theVertex.clipDistance[2]	= 1.0;
		theVertex.clipDistance[3]	= 1.0;
		theVertex.clipDistance[4]	= 1.0;
		theVertex.clipDistance[5]	= 1.0;
	}

	if (gClipSliceToWinLine)
	{
		//	Clip to the interval
		//
		//		-1.0 ≤ theDistanceAlongWinLineTube ≤ +1.0
		//
		//	NOTE(review):  this branch reads placementInFrameCell, which exists
		//	only when gDrawSlice is true -- presumably gClipSliceToWinLine
		//	is never enabled for solids;  confirm against the calling code.

		const float	theDistanceAlongWinLineTube	= dot(	thePositionInFrameCell,
														placementInFrameCell.itsExtraClippingCovector);

		theVertex.clipDistance[6]	= 1.0 + theDistanceAlongWinLineTube;
		theVertex.clipDistance[7]	= 1.0 - theDistanceAlongWinLineTube;
	}
	else
	{
		theVertex.clipDistance[6]	= 1.0;
		theVertex.clipDistance[7]	= 1.0;
	}


	//	Fog

	const half	theFogFactor		= clamp(
										(world.itsFogEnd - half(theRadialDistance)) * world.itsFogScale,
										0.0h,
										1.0h);


	//	Directional light
	//
	//		Even though the walls use a point light,
	//		a directional light works fine here.  As well as
	//		being a little cheaper to compute, it avoids any possibility
	//		of ever running into trouble trying to normalize
	//		a light direction vector of length zero.

	const half	theDiffuseFactor	= 0.5h + 0.5h * half(dot(cntLightDirection, theNormalInWorld));


	//	Final color

	const half	theBrightness		= theFogFactor * theDiffuseFactor;

	theVertex.color = half4(theBrightness, theBrightness, theBrightness, 1.0h) * theBaseColor;	//	premultiplied alpha

	return theVertex;
}


#pragma mark -
#pragma mark 3D polyhedron fragment function

//	Fragment function for the 3D polyhedra:
//	returns the interpolated vertex color, unmodified.
fragment half4 TorusGamesFragmentFunction3DPolyhedron(
	FragmentInput3DPolyhedron	in	[[ stage_in	]])
{
	const half4	thePolyhedronColor = in.color;	//	premultiplied alpha

	return thePolyhedronColor;
}


#pragma mark -
#pragma mark Compute functions

kernel void TorusGamesComputeFunctionMakeTicTacToeGrid(
	texture2d<half, access::write>	aTexture	[[ texture(TextureIndexCFImage)	]],
	constant ushort2				&someLimits	[[ buffer(BufferIndexCFMisc)	]],
	constant half4					&aColor		[[ buffer(BufferIndexCFColor)	]],
	ushort2							aGridID		[[ thread_position_in_grid		]])
{
	//	Each thread writes the single texel at aGridID.
	//
	//	The texel is "interior" when both of its coordinates lie
	//	in the half-open interval [someLimits[0], someLimits[1]).
	//	Interior texels are left fully transparent;
	//	all other texels belong to the left, right, bottom or top border
	//	and receive aColor.

	const ushort	theInteriorBegin	= someLimits[0];	//	first interior coordinate
	const ushort	theInteriorEnd		= someLimits[1];	//	first coordinate past the interior

	const bool	theColumnIsInterior	= (aGridID.x >= theInteriorBegin && aGridID.x < theInteriorEnd);
	const bool	theRowIsInterior	= (aGridID.y >= theInteriorBegin && aGridID.y < theInteriorEnd);

	if (theColumnIsInterior && theRowIsInterior)
	{
		aTexture.write(half4(0.0h, 0.0h, 0.0h, 0.0h), aGridID);	//	fully transparent
	}
	else
	{
		aTexture.write(aColor, aGridID);	//	part of the border
	}
}

kernel void TorusGamesComputeFunctionMakeGomokuGrid(
	texture2d<half, access::write>	aTexture	[[ texture(TextureIndexCFImage)	]],
	constant ushort2				&someLimits	[[ buffer(BufferIndexCFMisc)	]],
	constant half4					&aColor		[[ buffer(BufferIndexCFColor)	]],
	ushort2							aGridID		[[ thread_position_in_grid		]])
{
	//	Each thread writes the single texel at aGridID.
	//
	//	A texel belongs to a grid line when at least one of its coordinates
	//	lies in the half-open interval [someLimits[0], someLimits[1]).
	//	Grid-line texels receive aColor; all others stay fully transparent.

	const ushort	theLineBegin	= someLimits[0];	//	first coordinate on the line
	const ushort	theLineEnd		= someLimits[1];	//	first coordinate past the line

	const bool	theTexelIsOnVerticalLine	= (aGridID.x >= theLineBegin && aGridID.x < theLineEnd);
	const bool	theTexelIsOnHorizontalLine	= (aGridID.y >= theLineBegin && aGridID.y < theLineEnd);

	//	Start with a fully transparent texel...
	half4	theTexelColor = half4(0.0h, 0.0h, 0.0h, 0.0h);

	//	...and override it if the texel sits on either grid line.
	if (theTexelIsOnVerticalLine || theTexelIsOnHorizontalLine)
	{
		theTexelColor = aColor;
	}

	aTexture.write(theTexelColor, aGridID);
}

//	Computes one texel of the Maze mask.
//
//	Each thread handles the texel at aGridID.  It works out which maze cell
//	the texel belongs to, looks up that cell's wall flags in someParmeters.itsWalls,
//	and writes
//
//		opaque white	if the texel lies on a wall that's present,
//						or on one of the rounded corners (always present),
//		opaque black	otherwise.
//
kernel void TorusGamesComputeFunctionMakeMazeMask(
	texture2d<half, access::write>	aTexture		[[ texture(TextureIndexCFImage)	]],
	constant MazeMaskParameters		&someParmeters	[[ buffer(BufferIndexCFMisc)	]],
	ushort2							aGridID			[[ thread_position_in_grid		]])
{
	ushort	theOffset;		//	half a cell width, in pixels
	ushort2	theOffsetPixel,	//	aGridID shifted by theOffset in each direction
			theCell,		//	(column, row) of the cell containing the pixel
			theLocalPixel;	//	the pixel's location in the cell's own local coordinates
	//	theWalls holds meaningful data only in its four lowest-order bits.
	//	Nevertheless we declare it as a ushort rather than a uint8_t,
	//	for the reason explained in the MazeMaskParameters struct
	//	in TorusGamesGPUDefinitions.h .  The GPU uses 16-bit registers,
	//	so treating theWalls as a 16-bit value makes sense in any case.
	ushort	theWalls;
	bool2	theCellEdgeIsNearby,		//	is the pixel within itsLineHalfWidth of a cell edge?
			theWallIsPresentAndNearby;	//	same, and the wall on that edge is present
	float2	theCellEdgeDistance;	//	measured from pixel center to cell's edge line
	half4	theColor;

	//	Locate the cell to which the pixel belongs
	//	and also its relative coordinates within that cell.
	
	//	The texture origin (0,0) sits at the center of a maze cell,
	//	so to get started let's offset the pixel by half a cell width
	//	in each direction.
	theOffset		= someParmeters.itsPixelsPerCell / 2;
	theOffsetPixel	= aGridID + ushort2(theOffset, theOffset);

	//	Performance note:  Because itsPixelsPerCell is a power of two,
	//	we could replace the division by a bitshift and the modulus by a mask,
	//	if performance were an issue, but it's not.  The Maze mask
	//	gets recomputed only when the user starts a new maze.
	//
	theCell			= theOffsetPixel / someParmeters.itsPixelsPerCell;
	theLocalPixel	= theOffsetPixel % someParmeters.itsPixelsPerCell;
	
	//	Because of theOffset, pixels near the top of the board
	//	will end up in the bottom row of cells, and pixels near
	//	the board's right side will end up in the leftmost column of cells.
	//
	//	In a Klein bottle, wrapping from the top row to the bottom row
	//	also mirrors the horizontal direction, so we reflect
	//	both the cell's column and the pixel's local x coordinate.
	//	The reflected column may equal itsCellsPerRow, in which case
	//	the horizontal wrapping test that follows sends it to column 0.
	//
	if (theCell.y >= someParmeters.itsCellsPerRow)	//	test for '>' too, just to be safe
	{
		theCell.y = 0;
		if (someParmeters.itsKleinBottleFlag)
		{
			theCell.x		= someParmeters.itsCellsPerRow - theCell.x;
			theLocalPixel.x	= (someParmeters.itsPixelsPerCell - 1) - theLocalPixel.x;
		}
	}
	if (theCell.x >= someParmeters.itsCellsPerRow)	//	test for '>' too, just to be safe
	{
		theCell.x = 0;
	}
	
	//	Locate the flags saying which walls are present.
	//
	//		0x0008	west  wall
	//		0x0004	south wall
	//		0x0002	east  wall
	//		0x0001	north wall
	//
	theWalls = someParmeters.itsWalls[theCell.x][theCell.y];
	
	//	Compute the pixel's coordinates relative to the nearest edges of the enclosing cell.

	//		west or east wall
	if (theLocalPixel.x <  someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.x		= true;
		theWallIsPresentAndNearby.x	= ((theWalls & 0x0008) != 0);	//	west wall
		theCellEdgeDistance.x		= float(theLocalPixel.x) + 0.5;	//	measured from pixel center
	}
	else
	if (theLocalPixel.x >= someParmeters.itsPixelsPerCell - someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.x		= true;
		theWallIsPresentAndNearby.x	= ((theWalls & 0x0002) != 0);	//	east wall
		theCellEdgeDistance.x		= someParmeters.itsPixelsPerCell - theLocalPixel.x - 0.5;	//	measured from pixel center
	}
	else
	{
		//	The pixel is too far from both vertical edges.
		theCellEdgeIsNearby.x		= false;
		theWallIsPresentAndNearby.x	= false;
		theCellEdgeDistance.x		= 0.0;	//	unused when theCellEdgeIsNearby.x is false
	}

	//		north or south wall
	if (theLocalPixel.y <  someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.y		= true;
		theWallIsPresentAndNearby.y	= ((theWalls & 0x0004) != 0);	//	south wall
		theCellEdgeDistance.y		= float(theLocalPixel.y) + 0.5;	//	measured from pixel center
	}
	else
	if (theLocalPixel.y >= someParmeters.itsPixelsPerCell - someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.y		= true;
		theWallIsPresentAndNearby.y	= ((theWalls & 0x0001) != 0);	//	north wall
		theCellEdgeDistance.y		= someParmeters.itsPixelsPerCell - theLocalPixel.y - 0.5;	//	measured from pixel center
	}
	else
	{
		//	The pixel is too far from both horizontal edges.
		theCellEdgeIsNearby.y		= false;
		theWallIsPresentAndNearby.y	= false;
		theCellEdgeDistance.y		= 0.0;	//	unused when theCellEdgeIsNearby.y is false
	}

	//	Is the pixel part of a wall?
	if (theWallIsPresentAndNearby.x
	 || theWallIsPresentAndNearby.y)
	{
		theColor = half4(1.0, 1.0, 1.0, 1.0);	//	white
	}
	else
	if	//	Is the pixel part of a corner?  (Corners are always present.)
	(
		theCellEdgeIsNearby.x
	 && theCellEdgeIsNearby.y
	 && (
	 		//	Draw each corner (that's not part of a wall)
	 		//	as a quarter of a disk of radius itsLineHalfWidth,
	 		//	centered at the cell's corner.
	 		//
	 		//		Note:  In principle we could antialias
	 		//		the quarter-disk's circumference,
	 		//		by letting theColor's brightness
	 		//		be proportional to what fraction
	 		//		of the pixel's area lies within the disk.
	 		//		In practice this probably isn't necessary,
	 		//		given that
	 		//
	 		//		- in many cases one of the (non-base) mipmap levels
	 		//			will get used,
	 		//
	 		//		- the GPU fragment function will interpolate
	 		//			the texel colors when sampling, and
	 		//
	 		//		- the Maze's walls will be pretty thin in any case.
	 		//
			theCellEdgeDistance.x * theCellEdgeDistance.x
		  + theCellEdgeDistance.y * theCellEdgeDistance.y
		  < someParmeters.itsLineHalfWidth * someParmeters.itsLineHalfWidth
		)
	)
	{
		theColor = half4(1.0, 1.0, 1.0, 1.0);	//	white
	}
	else	//	The pixel lies in the cell's interior.
	{
		theColor = half4(0.0, 0.0, 0.0, 1.0);	//	black
	}

	aTexture.write(theColor, aGridID);
}

//	32-bit variant of the Maze mask compute function, for the A7 GPU.
//
//	Each thread handles the texel at aGridID and writes
//
//		opaque white	if the texel lies on a wall that's present,
//						or on one of the rounded corners,
//		opaque black	otherwise.
//
kernel void TorusGamesComputeFunctionMakeMazeMaskLEGACY(
	texture2d<half, access::write>	aTexture		[[ texture(TextureIndexCFImage)	]],
	constant MazeMaskParameters		&someParmeters	[[ buffer(BufferIndexCFMisc)	]],
	uint2							aGridID			[[ thread_position_in_grid		]])
{
	//	According to WWDC 2016 #606, the A8 GPU and later
	//	use 16-bit register units, so it's best to use
	//	half and short for arithmetic whenever possible.
	//
	//	The A7 GPU is willing to run code written for half and short,
	//	but its computations on short2 and ushort2 are extremely buggy.
	//	To work around those bugs, this legacy function is
	//	just like the GPU function
	//
	//		TorusGamesComputeFunctionMakeMazeMask()
	//
	//	given immediately above, but with all 16-bit shorts
	//	converted to 32-bit ints.
	//
	//	The comments given in the non-legacy version
	//	apply equally well here.

	uint	theOffset;		//	half a cell width, in pixels
	uint2	theOffsetPixel,	//	aGridID shifted by theOffset in each direction
			theCell,		//	(column, row) of the cell containing the pixel
			theLocalPixel;	//	the pixel's location within that cell
	uint	theWalls;		//	wall flags for theCell
	bool2	theCellEdgeIsNearby,		//	is the pixel within itsLineHalfWidth of a cell edge?
			theWallIsPresentAndNearby;	//	same, and the wall on that edge is present
	float2	theCellEdgeDistance;	//	from pixel center to the nearby cell edge
	half4	theColor;

	//	Shift the pixel by half a cell width in each direction
	//	before dividing the texture into cells.
	theOffset		= someParmeters.itsPixelsPerCell / 2;
	theOffsetPixel	= aGridID + uint2(theOffset, theOffset);

	//	Split the shifted pixel coordinates into a cell index
	//	and a position within that cell.
	theCell			= theOffsetPixel / someParmeters.itsPixelsPerCell;
	theLocalPixel	= theOffsetPixel % someParmeters.itsPixelsPerCell;
	
	//	Wrap a row index that ran off the top back to row 0.
	//	When itsKleinBottleFlag is set, that vertical wrap also
	//	reflects the column index and the pixel's local x coordinate.
	if (theCell.y >= someParmeters.itsCellsPerRow)
	{
		theCell.y = 0;
		if (someParmeters.itsKleinBottleFlag)
		{
			theCell.x		= someParmeters.itsCellsPerRow - theCell.x;
			theLocalPixel.x	= (someParmeters.itsPixelsPerCell - 1) - theLocalPixel.x;
		}
	}
	//	Wrap a column index that ran off the right side back to column 0.
	if (theCell.x >= someParmeters.itsCellsPerRow)
	{
		theCell.x = 0;
	}
	
	//	Fetch the flags saying which of the cell's walls are present.
	theWalls = someParmeters.itsWalls[theCell.x][theCell.y];
	
	//	Horizontal direction:  is the pixel near the cell's
	//	low-x edge (flag 0x8) or its high-x edge (flag 0x2)?
	if (theLocalPixel.x <  someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.x		= true;
		theWallIsPresentAndNearby.x	= ((theWalls & 0x00000008) != 0);
		theCellEdgeDistance.x		= float(theLocalPixel.x) + 0.5;	//	measured from pixel center
	}
	else
	if (theLocalPixel.x >= someParmeters.itsPixelsPerCell - someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.x		= true;
		theWallIsPresentAndNearby.x	= ((theWalls & 0x00000002) != 0);
		theCellEdgeDistance.x		= someParmeters.itsPixelsPerCell - theLocalPixel.x - 0.5;	//	measured from pixel center
	}
	else
	{
		theCellEdgeIsNearby.x		= false;
		theWallIsPresentAndNearby.x	= false;
		theCellEdgeDistance.x		= 0.0;
	}

	//	Vertical direction:  is the pixel near the cell's
	//	low-y edge (flag 0x4) or its high-y edge (flag 0x1)?
	if (theLocalPixel.y <  someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.y		= true;
		theWallIsPresentAndNearby.y	= ((theWalls & 0x00000004) != 0);
		theCellEdgeDistance.y		= float(theLocalPixel.y) + 0.5;	//	measured from pixel center
	}
	else
	if (theLocalPixel.y >= someParmeters.itsPixelsPerCell - someParmeters.itsLineHalfWidth)
	{
		theCellEdgeIsNearby.y		= true;
		theWallIsPresentAndNearby.y	= ((theWalls & 0x00000001) != 0);
		theCellEdgeDistance.y		= someParmeters.itsPixelsPerCell - theLocalPixel.y - 0.5;	//	measured from pixel center
	}
	else
	{
		theCellEdgeIsNearby.y		= false;
		theWallIsPresentAndNearby.y	= false;
		theCellEdgeDistance.y		= 0.0;
	}

	//	The pixel is part of a wall that's present.
	if (theWallIsPresentAndNearby.x
	 || theWallIsPresentAndNearby.y)
	{
		theColor = half4(1.0, 1.0, 1.0, 1.0);	//	white
	}
	else
	if	//	The pixel is near two edges at once, and lies within
		//	a quarter disk of radius itsLineHalfWidth at the cell's corner.
	(
		theCellEdgeIsNearby.x
	 && theCellEdgeIsNearby.y
	 && (
	 		theCellEdgeDistance.x * theCellEdgeDistance.x
		  + theCellEdgeDistance.y * theCellEdgeDistance.y
		  < someParmeters.itsLineHalfWidth * someParmeters.itsLineHalfWidth
		)
	)
	{
		theColor = half4(1.0, 1.0, 1.0, 1.0);	//	white
	}
	else	//	The pixel is part of neither a wall nor a corner.
	{
		theColor = half4(0.0, 0.0, 0.0, 1.0);	//	black
	}

	aTexture.write(theColor, aGridID);
}

//	Computes one texel of each of the two Jigsaw collages.
//
//	Each thread handles the collage texel at aGridID.  It
//
//		1.	decides which block (puzzle piece plus surrounding pixels)
//			the texel belongs to,
//		2.	reads the corresponding pixel from aSourceImage, after wrapping
//			(torus or Klein bottle) and phase-shifting the coordinates,
//		3.	consults aPieceTemplate to decide whether the texel
//			is transparent, part of the piece's black border,
//			or part of the piece's interior, and
//		4.	writes the result to aCollageWithoutBorders
//			and to aCollageWithBorders.
//
kernel void TorusGamesComputeFunctionMakeJigsawCollages(
	texture2d<half, access::read>	aSourceImage			[[ texture(TextureIndexCFJigsawPuzzleSourceImage)		]],
	texture2d<half, access::read>	aPieceTemplate			[[ texture(TextureIndexCFJigsawPieceTemplate)			]],
	texture2d<half, access::write>	aCollageWithoutBorders	[[ texture(TextureIndexCFJigsawCollageWithoutBorders)	]],
	texture2d<half, access::write>	aCollageWithBorders		[[ texture(TextureIndexCFJigsawCollageWithBorders)		]],
	constant JigsawCollageLayout	&aCollageLayout			[[ buffer(BufferIndexCFMisc)							]],
	ushort2							aGridID					[[ thread_position_in_grid								]])
{
	ushort2	theBlock;	//	Each collage comprises an n×n set of blocks, with each block
						//		containing a single piece of the Jigsaw puzzle
						//		along with some surrounding pixels.
						//		So, for example, in a 3×3 puzzle, theBlock = (2,1)
						//		would refer to the block in column 2, row 1.
	short2	theSrcCoordsRawPxSigned;	//	Signed coordinates of the desired "source pixel"
										//		in a "repeating view" of the aSourceImage
	ushort2	theSrcCoordsRawPx,			//	The desired source pixel with coordinates in {0, …, s-1}
			theSrcCoordsShiftedPx,		//		same, after phase shift
			theSrcCoordsWrappedPx;		//		same, after wrapping
	half4	theBorderlessCollagePixelColorAsRescaledP3,	//	color exactly as read from aSourceImage
			theBorderlessCollagePixelColor,				//	color for aCollageWithoutBorders
			theBorderedCollagePixelColor;				//	color for aCollageWithBorders
	float2	thePuzzleCoords,	//	pixel center, in units of one piece width
			thePieceCoords;		//	same, but relative to the center of the current piece
	uchar4	thePieceTabs;		//	indexed below as 0 = west, 1 = east, 2 = south, 3 = north
	bool	thePieceHasTabOnSideH,
			thePieceHasTabOnSideV;
	float	h,
			v;
	ushort	hh,
			vv;
	ushort2	theTemplateMaxCoords,
			theTexelCoordsH,
			theTexelCoordsV;
	half4	theTexelColorH,
			theTexelColorV;
	//	Note:  The following four variables are declared as bool,
	//	so assigning a texel's color component to one of them
	//	yields true if that component is non-zero and false if it's zero.
	bool	theTransparencyH,
			theTransparencyV,
			theBrightnessH,
			theBrightnessV;

	//	Matrix for converting a Display P3 color to extended-range sRGB.
	//	It gets used only in the iOS branch of the #if below.
	//	In Metal, each inner brace list initializes one column of the matrix.
	half4x4 gP3toSRGB =
	{
		{ 1.2249401762805587, -0.0420569547096881, -0.0196375545903344, 0.0},
		{-0.2249401762805597,  1.0420569547096874, -0.0786360455506319, 0.0},
		{ 0.0000000000000001,  0.0000000000000000,  1.0982736001409661, 0.0},
		{ 0.0,                 0.0,                 0.0,                1.0}
	};

 	//	All coordinates run left-to-right in the first coordinate,
 	//	and bottom-to-top in the second coordinate.


	//	Basic pixel color
	
	//	Each pixel in the collage belongs to some extended block
	//	that includes a puzzle piece and its surrounding pixels.
	//	Which puzzle piece does the given collage pixel belong to?
	//
	//		Note #1:  itsApproxBlockStride is an integer approximation
	//		to the true block stride (which is typically not an integer).
	//		But that's OK:  we needn't worry about pixels near
	//		the boundary between two blocks, because such pixels
	//		never get drawn, no matter which of the nearby blocks
	//		we assign them to.
	//
	//		Note #2:  itsApproxBlockStride may be a fractional pixel
	//		larger (but never smaller) than the true block stride.
	//		So we always get block coordinates in the valid range [0, n-1].
	//
	theBlock = aGridID / aCollageLayout.itsApproxBlockStride;	//	fractional part intentionally discarded

	//	What are the coordinates of the corresponding pixel in aSourceImage?
	theSrcCoordsRawPxSigned = short2(aGridID)
							- short2(aCollageLayout.itsInitialOffsetPx
								+  theBlock * aCollageLayout.itsSubsequentOffsetPx);
	
	//	Wrap to the "fundamental square".
	//	In other words, wrap to put each coordinate into the range {0, …, s-1}.
	//
	//		When wrapping the vertical coordinate in a Klein bottle image,
	//		be sure to flip the horizontal coordinate at the same time.
	//		(For greatest efficiency, aCollageLayout.itsKleinBottleFlag
	//		could be replaced by a function constant, but in practice
	//		I doubt that will be necessary.)
	//
	//		The vertical coordinate got wrapped if and only if
	//		masking it changed its value.
	//
	theSrcCoordsRawPx = ( ushort2(theSrcCoordsRawPxSigned) & aCollageLayout.itsCoordinateMask );
	if ( aCollageLayout.itsKleinBottleFlag && (short(theSrcCoordsRawPx.y) != theSrcCoordsRawPxSigned.y) )
		theSrcCoordsRawPx.x = aCollageLayout.itsCoordinateMask - theSrcCoordsRawPx.x;

	//	Apply the phase shift
	//
	//		Note:  In a Klein bottle, the horizontal phase shift
	//		will always be either 0 or s/2, to respect
	//		the drawing's “glide reflection axes”.
	//
	theSrcCoordsShiftedPx = theSrcCoordsRawPx + aCollageLayout.itsPhaseShift;

	//	Again wrap the coordinates to the fundamental domain.
	//
	//		In a torus we could work in signed coordinates and
	//		wrap only once, rather than at two points in the code.
	//		But in a Klein bottle we'd need to allow for the possibility
	//		that the phase shift takes a pixel not to the nearest
	//		neighboring copy of the fundamental square, but to the copy
	//		after that one (which isn't mirror reversed).  That wouldn't
	//		be difficult to handle, but the current organization
	//		seems simpler.
	//
	theSrcCoordsWrappedPx = ( theSrcCoordsShiftedPx & aCollageLayout.itsCoordinateMask );
	if (aCollageLayout.itsKleinBottleFlag && (theSrcCoordsWrappedPx.y != theSrcCoordsShiftedPx.y))
		theSrcCoordsWrappedPx.x = aCollageLayout.itsCoordinateMask - theSrcCoordsWrappedPx.x;

	//	Read the source pixel color.
	theBorderlessCollagePixelColorAsRescaledP3 = aSourceImage.read(theSrcCoordsWrappedPx);
	
#if defined(__METAL_IOS__)

	//	To its credit, iOS uses sRGB color coordinates
	//	with complete consistency, with extended-range
	//	coordinates allowing for wide color.
	//	Unfortunately ASTC texture compression
	//	doesn't support extended-range color coordinates.
	//	But because ASTC is such a wonderfully efficient way
	//	to keep the size of the Jigsaw Puzzles images small
	//	(and thus keep Torus Games' overall download size small)
	//	I devised the following trick to use it with wide-color images:
	//	I took the puzzle images in Display P3 and used
	//	Apple's Color Sync utility to "Assign" an sRGB profile,
	//	in effect artificially re-interpreting the P3 color values
	//	as sRGB values, to take advantage of the Asset Catalog's
	//	automatic ASTC encoding.  (If we had instead passed
	//	the P3 image directly, I'm pretty sure the Asset Catalog
	//	would have clamped the colors to the sRGB gamut,
	//	rather than scaling the whole color space linearly.)
	//	Here we must undo that trick, and re-interpret
	//	the pixel color as Display P3, which we may immediately
	//	convert to extended-range sRGB for consistency
	//	with iOS's color-space convention.
	//
	theBorderlessCollagePixelColor = gP3toSRGB * theBorderlessCollagePixelColorAsRescaledP3;

#elif defined(__METAL_MACOS__)

	//	On macOS, no rescaling is needed.
	//	The (non-extended!) color coordinates will get interpreted
	//	as a Display P3 color, because that's Torus Games' working color space.
	//
	theBorderlessCollagePixelColor = theBorderlessCollagePixelColorAsRescaledP3;

#else
	//	Unexpected platform:  use opaque yellow so the problem is visible.
	theBorderlessCollagePixelColor = half4(1.0h, 1.0h, 0.0h, 1.0h);
#endif
	

	//	For one-time personal use in creating a Jigsaw Puzzle icon.
	//	Colors are given in linear Display P3.
	//	Each block gets a solid color in place of the source image's pixels.
	if (gMakeGameChoiceIcons)
	{
		if (theBlock.x == 0 && theBlock.y == 0)
			theBorderlessCollagePixelColor = (half4) {1.000h, 0.000h, 0.125h, 1.000h};
		else
		if (theBlock.x == 1 && theBlock.y == 0)
			theBorderlessCollagePixelColor = (half4) {0.000h, 0.625h, 1.000h, 1.000h};
		else
		if (theBlock.x == 1 && theBlock.y == 1)
			theBorderlessCollagePixelColor = (half4) {0.000h, 1.000h, 0.625h, 1.000h};
		else
			theBorderlessCollagePixelColor = (half4) {1.000h, 0.000h, 1.000h, 1.000h};
	}

	
	//	Opacity and border
	
	//	The plan here is to switch to a coordinate system
	//	in which we can easily read opacity values
	//	and black border pixels from aPieceTemplate.
	
	//	Convert theSrcCoordsRawPxSigned from pixel coordinates,
	//	which take integer values in the range {0, …, s-1} × {0, …, s-1},
	//	to "puzzle coordinates" with floating-point values
	//	in the range [0.0, n] × [0.0, n].
	//
	//		Note:  We want the location of the pixel's center
	//		(not its lower-left corner) so we must add (0.5, 0.5)
	//		to the pixel coordinates before converting.
	//
	thePuzzleCoords = (float2(theSrcCoordsRawPxSigned) + float2(0.5, 0.5))
					* aCollageLayout.itsPieceWidthsPerPixel;
	
	//	"Piece coordinates" are just like "puzzle coordinates",
	//	but with the origin at the center of the current piece.
	thePieceCoords = thePuzzleCoords - (float2(0.5, 0.5) + float2(theBlock));
	
	//	Note whether the nearby sides of the puzzle piece
	//	have slots or tabs.
	thePieceTabs = aCollageLayout.itsPieceTabs[theBlock.x][theBlock.y];
	thePieceHasTabOnSideH = thePieceTabs[thePieceCoords.x >= 0.0 ? 1 /* east  */ : 0 /* west  */];
	thePieceHasTabOnSideV = thePieceTabs[thePieceCoords.y >= 0.0 ? 3 /* north */ : 2 /* south */];

	//	Let
	//
	//		h be the pixel's non-negative horizontal coordinate,
	//			which may run leftward or rightward,
	//	and
	//		v be the pixel's non-negative vertical coordinate,
	//			which may run downward or upward.
	//
	h = abs(thePieceCoords.x);
	v = abs(thePieceCoords.y);
	
	//	Convert h and v to the piece template's texel grid.
	//
	//		Note:  We multiply by itsPieceTemplateSize.x
	//		in both the horizontal and vertical directions.
	//		In effect we're pretending that aPieceTemplate
	//		is twice as tall as it really is.
	//		We'll clamp the vertical texture coordinate
	//		to stay within the texel grid provided.
	//
	hh = ushort(h * float(aCollageLayout.itsPieceTemplateSize.x));	//	fractional part discarded
	vv = ushort(v * float(aCollageLayout.itsPieceTemplateSize.x));	//	fractional part discarded

	//	Note:  The clamping in aPieceTemplate's x direction is
	//	only to allow for a few stray pixels near the boundary,
	//	but the clamping in its y direction affects all texels
	//	that lie beyond the puzzle piece's nominal edge.
	theTemplateMaxCoords = ushort2(
							aCollageLayout.itsPieceTemplateSize.x - ushort(1),
							aCollageLayout.itsPieceTemplateSize.y - ushort(1));
	
	//	Locate the texel for the shape of the piece's left or right side.
	//	For a side with a slot instead of a tab, read the template
	//	mirror-reversed in its x direction and swap the roles
	//	of its red and green components.
	theTexelCoordsH = ushort2(
		min(hh, theTemplateMaxCoords.x),  //	marginal clamping
		min(vv, theTemplateMaxCoords.y)); //	lots of clamping
	if ( ! thePieceHasTabOnSideH )
		theTexelCoordsH.x = theTemplateMaxCoords.x - theTexelCoordsH.x;
	theTexelColorH = aPieceTemplate.read(theTexelCoordsH);
	if (thePieceHasTabOnSideH)
	{
		theTransparencyH	= theTexelColorH.g;	//	green component
		theBrightnessH		= theTexelColorH.r;	//	 red  component
	}
	else
	{
		theTransparencyH	= theTexelColorH.r;	//	 red  component
		theBrightnessH		= theTexelColorH.g;	//	green component
	}

	//	Swap the roles of hh and vv to locate the texel
	//	for the shape of the piece's bottom or top side.
	theTexelCoordsV = ushort2(
		min(vv, theTemplateMaxCoords.x),  //	marginal clamping
		min(hh, theTemplateMaxCoords.y)); //	lots of clamping
	if ( ! thePieceHasTabOnSideV )
		theTexelCoordsV.x = theTemplateMaxCoords.x - theTexelCoordsV.x;
	theTexelColorV = aPieceTemplate.read(theTexelCoordsV);
	if (thePieceHasTabOnSideV)
	{
		theTransparencyV	= theTexelColorV.g;	//	green component
		theBrightnessV		= theTexelColorV.r;	//	 red  component
	}
	else
	{
		theTransparencyV	= theTexelColorV.r;	//	 red  component
		theBrightnessV		= theTexelColorV.g;	//	green component
	}
	
	//	Is the pixel opaque?
	//	(It's opaque only if neither template texel marks it as transparent.)
	if (theTransparencyH == 0.0 && theTransparencyV == 0.0)
	{
		//	The pixel is opaque.
		//	theBorderlessCollagePixelColor is fine as it is.
		
		//	Is the pixel bright? or part of the black border in the bordered collage?
		//	(It's bright only if both template texels say so.)
		if (theBrightnessH != 0.0 && theBrightnessV != 0.0)
			theBorderedCollagePixelColor = theBorderlessCollagePixelColor;
		else
			theBorderedCollagePixelColor = half4(0.0h, 0.0h, 0.0h, 1.0h);	//	opaque black
	}
	else	//	The pixel is transparent, in both borderless and bordered collages
	{
		theBorderlessCollagePixelColor	= half4(0.0h, 0.0h, 0.0h, 0.0h);	//	fully transparent
		theBorderedCollagePixelColor	= half4(0.0h, 0.0h, 0.0h, 0.0h);	//	fully transparent
	}
	
	//	Write the texel colors to the two collages.
	aCollageWithoutBorders.write(theBorderlessCollagePixelColor, aGridID);
	aCollageWithBorders.write(theBorderedCollagePixelColor, aGridID);
}

//	32-bit variant of the Jigsaw collage compute function, for the A7 GPU.
//
//	Each thread handles the collage texel at aGridID:  it reads
//	the corresponding pixel from aSourceImage, consults aPieceTemplate
//	to decide whether the texel is transparent, part of the black border
//	or part of the piece's interior, and writes the result
//	to aCollageWithoutBorders and to aCollageWithBorders.
//
kernel void TorusGamesComputeFunctionMakeJigsawCollagesLEGACY(
	texture2d<half, access::read>	aSourceImage			[[ texture(TextureIndexCFJigsawPuzzleSourceImage)		]],
	texture2d<half, access::read>	aPieceTemplate			[[ texture(TextureIndexCFJigsawPieceTemplate)			]],
	texture2d<half, access::write>	aCollageWithoutBorders	[[ texture(TextureIndexCFJigsawCollageWithoutBorders)	]],
	texture2d<half, access::write>	aCollageWithBorders		[[ texture(TextureIndexCFJigsawCollageWithBorders)		]],
	constant JigsawCollageLayout	&aCollageLayout			[[ buffer(BufferIndexCFMisc)							]],
	uint2							aGridID					[[ thread_position_in_grid								]])
{
	uint2	theBlock;					//	(column, row) of the block containing the texel
	int2	theSrcCoordsRawPxSigned;	//	signed source-pixel coordinates, before wrapping
	uint2	theSrcCoordsRawPx,			//	source-pixel coordinates after the first wrapping
			theSrcCoordsShiftedPx,		//		same, after phase shift
			theSrcCoordsWrappedPx;		//		same, after the second wrapping
	half4	theBorderlessCollagePixelColorAsRescaledP3,	//	color exactly as read from aSourceImage
			theBorderlessCollagePixelColor,				//	color for aCollageWithoutBorders
			theBorderedCollagePixelColor;				//	color for aCollageWithBorders
	float2	thePuzzleCoords,
			thePieceCoords;
	uchar4	thePieceTabs;	//	indexed below as 0 = west, 1 = east, 2 = south, 3 = north
	bool	thePieceHasTabOnSideH,
			thePieceHasTabOnSideV;
	float	h,
			v;
	uint	hh,
			vv;
	uint2	theTemplateMaxCoords,
			theTexelCoordsH,
			theTexelCoordsV;
	half4	theTexelColorH,
			theTexelColorV;
	//	Note:  The following four variables are declared as bool,
	//	so assigning a texel's color component to one of them
	//	yields true if that component is non-zero and false if it's zero.
	bool	theTransparencyH,
			theTransparencyV,
			theBrightnessH,
			theBrightnessV;

	//	Matrix for converting a Display P3 color to sRGB.
	//	It gets used only in the iOS branch of the #if below.
	//	In Metal, each inner brace list initializes one column of the matrix.
	half4x4 gP3toSRGB =
	{
		{ 1.2249401762805587, -0.0420569547096881, -0.0196375545903344, 0.0},
		{-0.2249401762805597,  1.0420569547096874, -0.0786360455506319, 0.0},
		{ 0.0000000000000001,  0.0000000000000000,  1.0982736001409661, 0.0},
		{ 0.0,                 0.0,                 0.0,                1.0}
	};

	//	According to WWDC 2016 #606, the A8 GPU and later
	//	use 16-bit register units, so it's best to use
	//	half and short for arithmetic whenever possible.
	//
	//	The A7 GPU is willing to run code written for half and short,
	//	but its computations on short2 and ushort2 are extremely buggy.
	//	To work around those bugs, this legacy function is
	//	just like the GPU function
	//
	//		TorusGamesComputeFunctionMakeJigsawCollages()
	//
	//	given immediately above, but with all 16-bit shorts
	//	converted to 32-bit ints.
	//
	//	The comments given in the non-legacy version
	//	apply equally well here.

	//	Which block does the collage texel belong to?
	theBlock = aGridID / aCollageLayout.itsApproxBlockStride;	//	fractional part discarded

	//	Locate the corresponding pixel in aSourceImage, in signed coordinates.
	theSrcCoordsRawPxSigned = int2(aGridID)
							- int2(aCollageLayout.itsInitialOffsetPx
								+  theBlock * aCollageLayout.itsSubsequentOffsetPx);
	
	//	Wrap the coordinates using itsCoordinateMask.
	//	When itsKleinBottleFlag is set and the vertical coordinate
	//	got changed by the wrapping, flip the horizontal coordinate too.
	theSrcCoordsRawPx = ( uint2(theSrcCoordsRawPxSigned) & aCollageLayout.itsCoordinateMask );
	if ( aCollageLayout.itsKleinBottleFlag && (int(theSrcCoordsRawPx.y) != theSrcCoordsRawPxSigned.y) )
		theSrcCoordsRawPx.x = aCollageLayout.itsCoordinateMask - theSrcCoordsRawPx.x;

	//	Apply the phase shift.
	theSrcCoordsShiftedPx = theSrcCoordsRawPx + uint2(aCollageLayout.itsPhaseShift);

	//	Wrap again, with the same Klein bottle flip as before.
	theSrcCoordsWrappedPx = ( theSrcCoordsShiftedPx & aCollageLayout.itsCoordinateMask );
	if (aCollageLayout.itsKleinBottleFlag && (theSrcCoordsWrappedPx.y != theSrcCoordsShiftedPx.y))
		theSrcCoordsWrappedPx.x = aCollageLayout.itsCoordinateMask - theSrcCoordsWrappedPx.x;

	//	Read the source pixel color, and convert it
	//	as required by the platform's color-space convention.
	theBorderlessCollagePixelColorAsRescaledP3 = aSourceImage.read(theSrcCoordsWrappedPx);
#if defined(__METAL_IOS__)
	//	iOS:  convert from P3 to sRGB.
	theBorderlessCollagePixelColor = gP3toSRGB * theBorderlessCollagePixelColorAsRescaledP3;
#elif defined(__METAL_MACOS__)
	//	macOS:  use the color as is.
	theBorderlessCollagePixelColor = theBorderlessCollagePixelColorAsRescaledP3;
#else
	//	Unexpected platform:  use opaque yellow so the problem is visible.
	theBorderlessCollagePixelColor = half4(1.0h, 1.0h, 0.0h, 1.0h);
#endif

	//	Convert the center of the source pixel (hence the extra (0.5, 0.5))
	//	from pixel units to units of one piece width.
	thePuzzleCoords = (float2(theSrcCoordsRawPxSigned) + float2(0.5, 0.5))
					* aCollageLayout.itsPieceWidthsPerPixel;
	
	//	Move the origin to the center of the current piece.
	thePieceCoords = thePuzzleCoords - (float2(0.5, 0.5) + float2(theBlock));
	
	//	Does the piece have a tab on the side nearest the pixel,
	//	horizontally and vertically?
	thePieceTabs = aCollageLayout.itsPieceTabs[theBlock.x][theBlock.y];
	thePieceHasTabOnSideH = thePieceTabs[thePieceCoords.x >= 0.0 ? 1 /* east  */ : 0 /* west  */];
	thePieceHasTabOnSideV = thePieceTabs[thePieceCoords.y >= 0.0 ? 3 /* north */ : 2 /* south */];

	//	Non-negative distances from the piece's center.
	h = abs(thePieceCoords.x);
	v = abs(thePieceCoords.y);
	
	//	Convert h and v to the piece template's texel grid.
	//	Both directions get scaled by itsPieceTemplateSize.x .
	hh = uint(h * float(aCollageLayout.itsPieceTemplateSize.x));	//	fractional part discarded
	vv = uint(v * float(aCollageLayout.itsPieceTemplateSize.x));	//	fractional part discarded

	//	Largest valid texel coordinates in aPieceTemplate, used for clamping.
	theTemplateMaxCoords = uint2(
							aCollageLayout.itsPieceTemplateSize.x - uint(1),
							aCollageLayout.itsPieceTemplateSize.y - uint(1));
	
	//	Locate the template texel for the piece's left or right side.
	//	If that side has no tab, read the template mirror-reversed
	//	in its x direction and swap the roles of its red and green components.
	theTexelCoordsH = uint2(
		min(hh, theTemplateMaxCoords.x),
		min(vv, theTemplateMaxCoords.y));
	if ( ! thePieceHasTabOnSideH )
		theTexelCoordsH.x = theTemplateMaxCoords.x - theTexelCoordsH.x;
	theTexelColorH = aPieceTemplate.read(theTexelCoordsH);
	if (thePieceHasTabOnSideH)
	{
		theTransparencyH	= theTexelColorH.g;	//	green component
		theBrightnessH		= theTexelColorH.r;	//	 red  component
	}
	else
	{
		theTransparencyH	= theTexelColorH.r;	//	 red  component
		theBrightnessH		= theTexelColorH.g;	//	green component
	}

	//	Same for the piece's bottom or top side,
	//	with the roles of hh and vv swapped.
	theTexelCoordsV = uint2(
		min(vv, theTemplateMaxCoords.x),
		min(hh, theTemplateMaxCoords.y));
	if ( ! thePieceHasTabOnSideV )
		theTexelCoordsV.x = theTemplateMaxCoords.x - theTexelCoordsV.x;
	theTexelColorV = aPieceTemplate.read(theTexelCoordsV);
	if (thePieceHasTabOnSideV)
	{
		theTransparencyV	= theTexelColorV.g;	//	green component
		theBrightnessV		= theTexelColorV.r;	//	 red  component
	}
	else
	{
		theTransparencyV	= theTexelColorV.r;	//	 red  component
		theBrightnessV		= theTexelColorV.g;	//	green component
	}
	
	//	The pixel is opaque only if neither template texel marks it as transparent.
	if (theTransparencyH == 0.0 && theTransparencyV == 0.0)
	{
		//	In the bordered collage, the pixel keeps its color
		//	only if both template texels mark it as bright.
		//	Otherwise it's part of the black border.
		if (theBrightnessH != 0.0 && theBrightnessV != 0.0)
			theBorderedCollagePixelColor = theBorderlessCollagePixelColor;
		else
			theBorderedCollagePixelColor = half4(0.0h, 0.0h, 0.0h, 1.0h);	//	opaque black
	}
	else	//	The pixel is transparent in both collages.
	{
		theBorderlessCollagePixelColor	= half4(0.0h, 0.0h, 0.0h, 0.0h);	//	fully transparent
		theBorderedCollagePixelColor	= half4(0.0h, 0.0h, 0.0h, 0.0h);	//	fully transparent
	}
	
	//	Write the texel colors to the two collages.
	aCollageWithoutBorders.write(theBorderlessCollagePixelColor, aGridID);
	aCollageWithBorders.write(theBorderedCollagePixelColor, aGridID);
}
